Use the requests module to make an HTTP request to http://www.tripadvisor.com
In [ ]:
import requests

# Request the TripAdvisor home page and report how the server responded.
home_url = 'http://www.tripadvisor.com/'
home_response = requests.get(home_url)
print(home_response.status_code)
# print(home_response.headers)
In [ ]:
import requests

# Fetch TripAdvisor's robots.txt, which lists the paths crawlers may visit.
url = 'http://www.tripadvisor.com/robots.txt'
response = requests.get(url)
if response.status_code == 200:
    print(response.status_code)
    print(response.text)
else:
    # Bug fix: the original referenced an undefined name `resp` here, which
    # raised NameError on any non-200 response instead of printing the error.
    print('Failed to get a response from the url. Error code: ', response.status_code)
In [ ]:
import requests

# Fetch the TripAdvisor landing page and dump its HTML on success.
url = 'http://tripadvisor.com'
response = requests.get(url)
if response.status_code == 200:
    print(response.status_code)
    print(response.text)
else:
    # Bug fix: the original referenced an undefined name `resp` here, which
    # raised NameError on any non-200 response instead of printing the error.
    print('Failed to get a response from the url. Error code: ', response.status_code)
Sometimes, you may want a little bit of information - a movie rating, stock price, or product availability - but the information is available only in HTML pages, surrounded by ads and extraneous content.
To do this, we build an automated web fetcher called a crawler or spider. After the HTML contents have been retrieved from the remote web servers, a scraper parses them to find the needle in the haystack.
The bs4 module can be used for searching a webpage (HTML file) and pulling required data from it. It makes an HTML page searchable as follows:
This module takes the HTML page and creates four kinds of objects: Tag, NavigableString, BeautifulSoup, and Comment.
Read more about BeautifulSoup : https://www.crummy.com/software/BeautifulSoup/bs4/doc/
In [ ]:
<h1 id="HEADING" property="name" class="heading_name ">
<div class="heading_height"></div>
"
Le Jardin Napolitain
"
</h1>
Let us write the code to parse an HTML page. We will use the TripAdvisor URL for an infamous restaurant - https://www.tripadvisor.com/Restaurant_Review-g187147-d1751525-Reviews-Cafe_Le_Dome-Paris_Ile_de_France.html
In [ ]:
import requests
from bs4 import BeautifulSoup

# Download a single restaurant review page and parse it into a soup object.
scrape_url = 'https://www.tripadvisor.com/Restaurant_Review-g187147-d1751525-Reviews-Cafe_Le_Dome-Paris_Ile_de_France.html'
response = requests.get(scrape_url)
print(response.status_code)
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')  # Soup
    # Bug fix: prettify is a method; the original printed the bound-method
    # object (`<bound method ...>`) instead of the formatted HTML.
    print(soup.prettify())
In [ ]:
<div class="entry">
<p class="partial_entry">
Popped in on way to Eiffel Tower for lunch, big mistake.
Pizza was disgusting and service was poor.
It’s a shame Trip Advisor don’t let you score venues zero....
<span class="taLnk ulBlueLinks" onclick="widgetEvCall('handlers.clickExpand',event,this);">More
</span>
</p>
</div>
Let us try and find all the < p > (paragraph) tags in the soup:
In [ ]:
import requests
from bs4 import BeautifulSoup
def scrapecontent(url):
    """Fetch *url* and parse the response body with BeautifulSoup.

    Prints the HTTP status code of every request. Returns the parsed soup
    on HTTP 200; otherwise prints the error code and returns None.
    """
    scrape_response = requests.get(url)
    print(scrape_response.status_code)
    # Guard clause: bail out early on anything other than a successful fetch.
    if scrape_response.status_code != 200:
        print('Error accessing url : ', scrape_response.status_code)
        return None
    return BeautifulSoup(scrape_response.text, 'html.parser')
def main():
    """Scrape one review page and print the text of every partial review."""
    scrape_url = 'https://www.tripadvisor.com/Restaurant_Review-g187147-d1751525-Reviews-Cafe_Le_Dome-Paris_Ile_de_France.html'
    page_soup = scrapecontent(scrape_url)
    if page_soup:
        # Reviews are stored as text inside <p class="partial_entry"> tags.
        for entry in page_soup.find_all('p', class_='partial_entry'):
            print(entry.text)

main()
In [ ]:
import requests
from bs4 import BeautifulSoup
def scrapecontent(url):
    """Download *url* and return its HTML parsed by BeautifulSoup.

    Prints the HTTP status code of every request. On a non-200 response the
    error code is printed and None is returned instead of a soup object.
    """
    page = requests.get(url)
    print(page.status_code)
    if page.status_code == 200:
        return BeautifulSoup(page.text, 'html.parser')
    print('Error accessing url : ', page.status_code)
    return None
def main():
    """Scrape the first six review pages (10 reviews per page) and print each review."""
    # TripAdvisor paginates reviews via an '-or<offset>-' URL segment.
    for offset in range(0, 60, 10):
        scrape_url = 'https://www.tripadvisor.com/Restaurant_Review-g187147-d1751525-Reviews-or' + str(offset) + '-Cafe_Le_Dome-Paris_Ile_de_France.html'
        page_soup = scrapecontent(scrape_url)
        if page_soup:
            # Only the text of each review matters, not the surrounding markup.
            for entry in page_soup.find_all('p', class_='partial_entry'):
                print(entry.text)

main()
Using yesterday's sentiment analysis code and the corpus of sentiments found in the word_sentiment.csv file, calculate the sentiment of the reviews.
In [ ]:
#Enter your code here
In [ ]:
import requests
from bs4 import BeautifulSoup
def scrapecontent(url):
    """Retrieve *url* and hand back a BeautifulSoup parse of the page.

    Every request's HTTP status code is printed. A failed request (status
    other than 200) prints the error code and yields None.
    """
    resp = requests.get(url)
    print(resp.status_code)
    ok = resp.status_code == 200
    if not ok:
        print('Error accessing url : ', resp.status_code)
        return None
    return BeautifulSoup(resp.text, 'html.parser')
def main():
    """Scrape one review page and print each review's date, text, and rating."""
    scrape_url = 'https://www.tripadvisor.com/Restaurant_Review-g187147-d1751525-Reviews-Cafe_Le_Dome-Paris_Ile_de_France.html'
    ret_soup = scrapecontent(scrape_url)
    if ret_soup:
        for rev_data in ret_soup.find_all('div', class_='review-container'):
            # Robustness fix: find() returns None when a tag is absent; the
            # original crashed with AttributeError on any malformed container.
            date = rev_data.find('span', class_='ratingDate')  # Get the date of the review
            if date:
                print(date.text)
            review = rev_data.find('p')  # Get the review text
            if review:
                print(review.text)
            rating = rev_data.find('span', class_='ui_bubble_rating')  # Get the rating of the review
            if rating:
                # class list looks like ['ui_bubble_rating', 'bubble_45'];
                # the digits after 'bubble_' are the rating scaled by 10.
                print(int(rating['class'][1][7:]) / 10)

main()
Using the review data and the ratings available is there any way we can improve the corpus of sentiments "word_sentiment.csv" file?
In [ ]: